In [1]:
import pandas as pd
from pandas.tseries.offsets import DateOffset
import requests
from bs4 import BeautifulSoup
import json
from collections import Counter
from matplotlib import pyplot as plt
from datetime import datetime
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import re
import nltk

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
In [3]:
# Step 0: Load cleaned data from GitHub
import pandas as pd

# Replace the date with the correct one if needed
url = "https://raw.githubusercontent.com/Neeti3107/Foundation-Project_Group-14/main/data/cleaned/retail_sugar_prices_2025-04-15.csv"

# Load the cleaned file
df_filtered = pd.read_csv(url, parse_dates=['date'])

# Quick preview
df_filtered.head()
Out[3]:
date admin1 admin2 market market_id latitude longitude category commodity commodity_id unit priceflag pricetype currency price usdprice
0 1994-01-15 Gujarat Ahmadabad Ahmedabad 923 23.03 72.62 miscellaneous food sugar 97 KG actual retail INR 13.5 0.43
1 1994-01-15 Karnataka Bangalore Urban Bengaluru 926 12.96 77.58 miscellaneous food sugar 97 KG actual retail INR 13.2 0.42
2 1994-01-15 Maharashtra Mumbai city Mumbai 955 18.98 72.83 miscellaneous food sugar 97 KG actual retail INR 13.8 0.44
3 1994-01-15 Orissa Khordha Bhubaneshwar 929 20.23 85.83 miscellaneous food sugar 97 KG actual retail INR 13.5 0.43
4 1994-01-15 Tripura West Tripura Agartala 921 23.84 91.28 miscellaneous food sugar 97 KG actual retail INR 16.0 0.51
In [4]:
import pandas as pd
import requests
import re
from datetime import datetime

def get_latest_cleaned_csv_url(user, repo, path="data/cleaned"):
    api_url = f"https://api.github.com/repos/{user}/{repo}/contents/{path}"
    response = requests.get(api_url)
    files = response.json()

    csv_files = []
    for file in files:
        name = file['name']
        if name.startswith("retail_sugar_prices_") and name.endswith(".csv"):
            match = re.search(r"(\d{4}-\d{2}-\d{2})", name)
            if match:
                csv_files.append((match.group(1), name))

    if not csv_files:
        raise ValueError("❌ No cleaned CSV files found.")

    # Get latest date
    latest_date, latest_file = sorted(csv_files)[-1]
    print(f"📁 Latest file found: {latest_file}")

    raw_url = f"https://raw.githubusercontent.com/{user}/{repo}/main/{path}/{latest_file}"
    return raw_url

# === USE IT ===
user = "Neeti3107"
repo = "Foundation-Project_Group-14"
latest_csv_url = get_latest_cleaned_csv_url(user, repo)

# Load the file
df_filtered = pd.read_csv(latest_csv_url, parse_dates=['date'])
df_filtered.head()
📁 Latest file found: retail_sugar_prices_2025-04-16.csv
Out[4]:
date admin1 admin2 market market_id latitude longitude category commodity commodity_id unit priceflag pricetype currency price usdprice
0 1994-01-15 Gujarat Ahmadabad Ahmedabad 923 23.03 72.62 miscellaneous food sugar 97 KG actual retail INR 13.5 0.43
1 1994-01-15 Karnataka Bangalore Urban Bengaluru 926 12.96 77.58 miscellaneous food sugar 97 KG actual retail INR 13.2 0.42
2 1994-01-15 Maharashtra Mumbai city Mumbai 955 18.98 72.83 miscellaneous food sugar 97 KG actual retail INR 13.8 0.44
3 1994-01-15 Orissa Khordha Bhubaneshwar 929 20.23 85.83 miscellaneous food sugar 97 KG actual retail INR 13.5 0.43
4 1994-01-15 Tripura West Tripura Agartala 921 23.84 91.28 miscellaneous food sugar 97 KG actual retail INR 16.0 0.51
In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- EDA and Visualizations ---
# Step 1: Sort and create a copy
df_eda = df_filtered[['date', 'price']].copy()
df_eda = df_eda.sort_values('date')

# Step 2: Add Month and Year columns
df_eda['year'] = df_eda['date'].dt.year
df_eda['month'] = df_eda['date'].dt.to_period('M')

# Step 3: Percentage change
df_eda['pct_change'] = df_eda['price'].pct_change() * 100

# Step 4: Monthly average price
df_monthly = df_eda.groupby('month')['price'].mean().reset_index()
df_monthly['month'] = df_monthly['month'].dt.to_timestamp()

# Step 5: Line plot of monthly average price
plt.figure(figsize=(14, 6))
sns.lineplot(data=df_monthly, x='month', y='price', marker='o')
plt.title("Monthly Average Sugar Price (INR)")
plt.xlabel("Month")
plt.ylabel("Average Price (INR)")
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 6: Monthly percentage change
df_monthly['pct_change'] = df_monthly['price'].pct_change() * 100

plt.figure(figsize=(14, 6))
sns.barplot(data=df_monthly, x='month', y='pct_change', color='orange')
plt.title("Monthly Percentage Change in Sugar Price")
plt.xlabel("Month")
plt.ylabel("Percentage Change")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()


# Step 7: Price histogram
plt.figure(figsize=(10, 6))
sns.histplot(df_eda['price'], bins=30, kde=True)
plt.title("Distribution of Sugar Prices (Retail)")
plt.xlabel("Price (INR)")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 8: Boxplot by year
plt.figure(figsize=(14, 6))
sns.boxplot(x='year', y='price', data=df_eda)
plt.title("Year-wise Distribution of Sugar Prices (Retail)")
plt.xlabel("Year")
plt.ylabel("Price (INR)")
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 9: Heatmap of average price by year and month
df_eda['month_num'] = df_eda['date'].dt.month
pivot_table = df_eda.pivot_table(values='price', index='year', columns='month_num', aggfunc='mean')

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, cmap='YlGnBu', annot=True, fmt=".1f")
plt.title("Monthly Sugar Prices Heatmap (Retail, INR)")
plt.xlabel("Month")
plt.ylabel("Year")
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [6]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd

# Ensure 'month' is datetime and sorted
df_monthly = df_monthly.sort_values('month')

# Plot using Matplotlib to keep datetime on x-axis
fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(df_monthly['month'], df_monthly['pct_change'], color='orange')

# Formatting
ax.set_title("Monthly Percentage Change in Sugar Price")
ax.set_xlabel("Year")
ax.set_ylabel("Percentage Change")

# Set x-axis ticks every 5 years from Jan 1994
locator = mdates.YearLocator(base=5, month=1, day=1)
formatter = mdates.DateFormatter('%Y')
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

# Set limits from Jan 1994 to last available date
start_date = pd.Timestamp('1994-01-01')
end_date = df_monthly['month'].max()
ax.set_xlim([start_date, end_date])

plt.xticks(rotation=0)
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

# Create time series with datetime index
df_decompose = df_monthly[['month', 'price']].copy()
df_decompose.set_index('month', inplace=True)
df_decompose = df_decompose.sort_index()

# Optional: Interpolate or drop missing values
df_decompose = df_decompose.interpolate()

# Specify period manually (12 for monthly data)
decompose_result = seasonal_decompose(df_decompose['price'], model='additive', period=12)

# Plot the full decomposition
decompose_result.plot()
plt.suptitle("Seasonal Decomposition of Sugar Prices (Additive Model)", fontsize=16)
plt.tight_layout()
plt.show()

# Plot trend separately
plt.figure(figsize=(16, 6))
decompose_result.trend.plot(color='orange', title='Trend Component')
plt.grid(True)
plt.show()

# Plot seasonality separately
plt.figure(figsize=(16, 6))
decompose_result.seasonal.plot(color='green', title='Seasonal Component')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

📊 Insights from EDA of Retail Sugar Prices (1994–2025)¶


🧭 1. Overall Trends (Long-Term Behavior)¶

  • Sugar prices in India have steadily increased over the last 30 years.
  • From around ₹8/kg in 1994, prices have climbed to over ₹45/kg by recent years.
  • A strong upward trend is clearly visible in the trend decomposition plot, especially during:
    • 2009–2011: Noticeable inflationary spike in sugar prices.
    • 2020–2022: Gradual increase likely influenced by pandemic-era disruptions.

📈 2. Monthly Average & Seasonality¶

  • There is seasonal fluctuation in sugar prices, repeating every year.
  • Prices tend to drop in early months (April–May) and rise again toward the year-end.
  • This pattern suggests:
    • A possible link to agricultural harvest cycles or festive demand variations.
    • E.g., Diwali or end-of-year celebrations may create demand surges.

🧮 3. Monthly Percentage Change¶

  • Most months show mild percentage changes (<5%), but a few months spike or drop sharply:
    • These may correspond to policy changes, import/export controls, or supply shocks.
    • Sudden changes often align with known economic events or weather-related impacts.

📦 4. Distribution Insights¶

  • The most common sugar price historically was ₹8, which occurred 21 times — mostly in early years.
  • Overall, sugar prices follow a right-skewed distribution with a concentration of values between ₹20–₹40/kg.
  • This skew indicates gradual but consistent inflation in consumer sugar pricing.

📅 5. Year-wise Variability¶

  • The boxplot shows wider price ranges in later years (post-2010), suggesting:
    • Increased volatility in the market.
    • Possibly driven by global market influences, fuel costs, or climate-driven variability.

🌡️ 6. Heatmap View¶

  • The heatmap confirms a repeating seasonal cycle:
    • Sugar prices are lower in mid-year months and higher towards the end/start of each year.
    • 2009, 2016, and 2020 stand out with unusual spikes, possibly due to external shocks.

In [8]:
# Check Which States Have the Most Complete Sugar Price Data

# Add year-month for aggregation
df_filtered['year_month'] = df_filtered['date'].dt.to_period('M').dt.to_timestamp()

# Count how many months of data are available per state
state_monthly_counts = (
    df_filtered.groupby(['admin1', 'year_month'])['price']
    .count()
    .reset_index(name='count')
)

# Pivot to create heatmap-like matrix: rows=state, columns=month, values=presence (0/1)
state_presence = state_monthly_counts.copy()
state_presence['present'] = 1  # Mark available data
heatmap_df = state_presence.pivot(index='admin1', columns='year_month', values='present').fillna(0)

# Optional: sort states by number of months available
heatmap_df['total_months'] = heatmap_df.sum(axis=1)
heatmap_df = heatmap_df.sort_values('total_months', ascending=False).drop(columns='total_months')

# Plot heatmap
plt.figure(figsize=(16, 10))
sns.heatmap(heatmap_df, cmap='Greens', cbar=False, linewidths=0.1)
plt.title("📊 State-wise Monthly Data Completeness for Retail Sugar Prices")
plt.xlabel("Year-Month")
plt.ylabel("State (admin1)")
plt.tight_layout()
plt.show()

# Print summary table: states with most and least data
summary = (
    df_filtered.groupby('admin1')['year_month']
    .nunique()
    .sort_values(ascending=False)
    .reset_index(name='months_of_data')
)
print("Top states by data completeness:\n", summary.head(10))
print("\nBottom states by data completeness:\n", summary.tail(10))
C:\Users\neeti\AppData\Local\Temp\ipykernel_11328\757177853.py:28: UserWarning: Glyph 128202 (\N{BAR CHART}) missing from current font.
  plt.tight_layout()
C:\Users\neeti\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:170: UserWarning: Glyph 128202 (\N{BAR CHART}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
Top states by data completeness:
              admin1  months_of_data
0       Maharashtra             257
1        Tamil Nadu             251
2         Rajasthan             241
3            Orissa             240
4  Himachal Pradesh             231
5     Uttar Pradesh             230
6         Karnataka             223
7    Madhya Pradesh             219
8            Kerala             217
9             Bihar             215

Bottom states by data completeness:
                  admin1  months_of_data
21          Uttarakhand             112
22       Andhra Pradesh             111
23  Andaman and Nicobar              93
24           Chandigarh              92
25             Nagaland              92
26           Puducherry              79
27                  Goa              72
28         Chhattisgarh              34
29               Sikkim              21
30              Manipur              13
In [9]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Create pivot table (rows = date, columns = states)
pivot_prices = (
    df_filtered.groupby(['year_month', 'admin1'])['price']
    .mean()
    .unstack()
)

# Step 2: Interpolate missing values (linear within time)
pivot_prices_interp = pivot_prices.interpolate(method='linear', limit_direction='both')

# Optional: Only keep states with at least 180 months (15 years) of data
valid_states = pivot_prices_interp.columns[pivot_prices_interp.notna().sum() >= 180]
pivot_prices_interp = pivot_prices_interp[valid_states]

# Step 3: Standardize (each state's time series gets mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pivot_prices_interp.fillna(0).T)  # Transpose: states as rows

# Step 4: Apply KMeans clustering (you can tune 'n_clusters')
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to state names
cluster_df = pd.DataFrame({
    'state': pivot_prices_interp.columns,
    'cluster': clusters
}).sort_values('cluster')

print(cluster_df)

# Step 5: Optional PCA for 2D visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters, palette='Set2', s=100)
for i, state in enumerate(pivot_prices_interp.columns):
    plt.text(X_pca[i, 0], X_pca[i, 1], state, fontsize=9, ha='right')
plt.title("📍 State Clustering by Sugar Price Pattern (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()
C:\Users\neeti\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\neeti\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
                  state  cluster
15          Maharashtra        0
28        Uttar Pradesh        0
26            Telangana        0
25           Tamil Nadu        0
23            Rajasthan        0
20               Orissa        0
14       Madhya Pradesh        0
13               Kerala        0
12            Karnataka        0
11            Jharkhand        0
30          West Bengal        0
8               Gujarat        0
6                 Delhi        0
2                 Assam        0
3                 Bihar        0
4            Chandigarh        0
10     Himachal Pradesh        0
0   Andaman and Nicobar        1
16              Manipur        1
24               Sikkim        1
1        Andhra Pradesh        2
22               Punjab        2
9               Haryana        2
29          Uttarakhand        2
7                   Goa        2
21           Puducherry        2
5          Chhattisgarh        2
19             Nagaland        3
18              Mizoram        3
17            Meghalaya        3
27              Tripura        3
C:\Users\neeti\AppData\Local\Temp\ipykernel_11328\1823660468.py:49: UserWarning: Glyph 128205 (\N{ROUND PUSHPIN}) missing from current font.
  plt.tight_layout()
C:\Users\neeti\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:170: UserWarning: Glyph 128205 (\N{ROUND PUSHPIN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image

🧠 Cluster Interpretations¶


✅ Cluster 0 – Majority Group (Stable/Aligned Trend)¶

States: Maharashtra, Uttar Pradesh, Tamil Nadu, Rajasthan, Karnataka, Kerala, Madhya Pradesh, Gujarat, etc.

  • These states show similar seasonal patterns and consistent data availability.
  • Likely follow national sugar price trends.
  • Suitable for country-level modeling or selecting a representative sample group.

🌴 Cluster 1 – Sparse/Irregular or Island Territories¶

States: Andaman & Nicobar, Manipur, Sikkim

  • Often have sparse data or irregular price patterns.
  • May have unique supply chains (e.g., non-agricultural or import-dependent).
  • Not ideal for standard trend modeling without adjustment.

⚡ Cluster 2 – Semi-distinct / Volatile Trends¶

States: Andhra Pradesh, Punjab, Haryana, Goa, Puducherry, Chhattisgarh

  • Show greater price volatility or regional fluctuations.
  • Could be influenced by local governance, infrastructure, or supply-demand issues.
  • May need separate forecasting models or volatility handling.

🏔️ Cluster 3 – Northeast Focused Outliers¶

States: Nagaland, Mizoram, Meghalaya, Tripura

  • Exhibit distinctive pricing behavior compared to mainland states.
  • Influenced by transport costs, geography, or border trade policies.
  • Important to treat as a unique regional group in analysis.

In [10]:
pip install plotly geopandas
Requirement already satisfied: plotly in c:\users\neeti\anaconda3\lib\site-packages (5.22.0)Note: you may need to restart the kernel to use updated packages.

Requirement already satisfied: geopandas in c:\users\neeti\anaconda3\lib\site-packages (1.0.1)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\neeti\anaconda3\lib\site-packages (from plotly) (8.2.2)
Requirement already satisfied: packaging in c:\users\neeti\anaconda3\lib\site-packages (from plotly) (23.2)
Requirement already satisfied: numpy>=1.22 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (1.26.4)
Requirement already satisfied: pyogrio>=0.7.2 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (0.10.0)
Requirement already satisfied: pandas>=1.4.0 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (2.2.2)
Requirement already satisfied: pyproj>=3.3.0 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (3.7.1)
Requirement already satisfied: shapely>=2.0.0 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (2.1.0)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\neeti\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\neeti\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\neeti\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2023.3)
Requirement already satisfied: certifi in c:\users\neeti\anaconda3\lib\site-packages (from pyogrio>=0.7.2->geopandas) (2024.7.4)
Requirement already satisfied: six>=1.5 in c:\users\neeti\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=1.4.0->geopandas) (1.16.0)
In [11]:
import pandas as pd
import plotly.express as px
import json
import requests

# Your state-cluster data
state_cluster_df = pd.DataFrame({
    'state': [
        'Maharashtra', 'Uttar Pradesh', 'Telangana', 'Tamil Nadu', 'Rajasthan',
        'Orissa', 'Madhya Pradesh', 'Kerala', 'Karnataka', 'Jharkhand',
        'West Bengal', 'Gujarat', 'Delhi', 'Assam', 'Bihar', 'Chandigarh',
        'Himachal Pradesh', 'Andaman and Nicobar', 'Manipur', 'Sikkim',
        'Andhra Pradesh', 'Punjab', 'Haryana', 'Uttarakhand', 'Goa',
        'Puducherry', 'Chhattisgarh', 'Nagaland', 'Mizoram', 'Meghalaya', 'Tripura'
    ],
    'cluster': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3]
})

# Load India states GeoJSON
# Source: https://github.com/geohacker/india/blob/master/state/india_telengana.geojson
url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'  # For US by default
india_geojson_url = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_telengana.geojson'
india_states = requests.get(india_geojson_url).json()

# Fix names to match if needed
state_cluster_df['state'] = state_cluster_df['state'].str.title()

# Plotly Choropleth
fig = px.choropleth_mapbox(
    state_cluster_df,
    geojson=india_states,
    featureidkey='properties.NAME_1',
    locations='state',
    color='cluster',
    color_continuous_scale='Viridis',
    mapbox_style='carto-positron',
    center={"lat": 23.5937, "lon": 80.9629},
    zoom=3.8,
    opacity=0.7,
    title="State-wise Clustering Based on Sugar Price Patterns"
)

fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()
In [12]:
# === 1. Create the notebook folder if not exists
import os  # 👈 required for path operations
import pandas as pd
import os
from datetime import datetime
from pandas_profiling import ProfileReport
import subprocess

repo_path = r"C:\Users\neeti\Documents\ISB_Class of Summer_2025\04 Term 4\Foundation\Foundation-Project_Group-14"
notebook_path = os.path.join(repo_path, "notebooks")
os.makedirs(notebook_path, exist_ok=True)

# === 2. Push to GitHub
os.chdir(repo_path)
try:
    subprocess.run(["git", "add", "notebooks/EDA_Report.ipynb"], check=True)
    subprocess.run(["git", "commit", "-m", "Automated: Added EDA report notebook"], check=True)
    subprocess.run(["git", "push", "origin", "main"], check=True)
    print("🚀 EDA notebook committed and pushed to GitHub.")
except subprocess.CalledProcessError as e:
    print(f"❌ Git push failed: {e}")
C:\Users\neeti\AppData\Local\Temp\ipykernel_11328\3199605812.py:6: DeprecationWarning:

`import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead.

🚀 EDA notebook committed and pushed to GitHub.
In [13]:
# 📦 Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX

# 🧹 Step 1: Aggregate Monthly Country-level Data
df_filtered['month'] = df_filtered['date'].dt.to_period('M')
df_country = df_filtered.groupby('month')['price'].mean().reset_index()
df_country['month'] = df_country['month'].dt.to_timestamp()
df_country.set_index('month', inplace=True)


# 📆 Step 2: Train/Test Split
df_train = df_country[df_country.index < pd.to_datetime("2019-01-01")]
df_test = df_country[df_country.index >= pd.to_datetime("2019-01-01")]

# 📊 Step 3: Plot
plt.figure(figsize=(12, 6))
plt.plot(df_train.index, df_train['price'], label='Train', color='black')
plt.plot(df_test.index, df_test['price'], label='Test', color='red')
plt.xlabel("Date")
plt.ylabel("Sugar Price")
plt.title("Train/Test Split - Country-Level Sugar Price")
plt.legend()
plt.grid(True)
plt.show()

# 🧪 Step 4: ADFuller Test for Stationarity (on full data)
print("=== ADF Test on Original Series ===")
adf_result = adfuller(df_country['price'])
print(f"ADF Statistic : {adf_result[0]:.4f}")
print(f"p-value : {adf_result[1]:.4f}")
print(f"Lags Used : {adf_result[2]}")
print("Critical Values :")
for key, value in adf_result[4].items():
    print(f"    {key}: {value:.4f}")

# 🌀 Step 5: Differencing with Rolling Mean (if non-stationary)
rolling_mean = df_country['price'].rolling(window=12).mean()
df_country['rolling_mean_diff'] = rolling_mean - rolling_mean.shift()

# 📉 Step 6: ADF Test on Differenced Series
print("\n=== ADF Test on Rolling Mean Differenced Series ===")
adf_result_diff = adfuller(df_country['rolling_mean_diff'].dropna())
print(f"ADF Statistic : {adf_result_diff[0]:.4f}")
print(f"p-value : {adf_result_diff[1]:.4f}")
print(f"Lags Used : {adf_result_diff[2]}")
print("Critical Values :")
for key, value in adf_result_diff[4].items():
    print(f"    {key}: {value:.4f}")
No description has been provided for this image
=== ADF Test on Original Series ===
ADF Statistic : -0.9168
p-value : 0.7824
Lags Used : 2
Critical Values :
    1%: -3.4508
    5%: -2.8706
    10%: -2.5716

=== ADF Test on Rolling Mean Differenced Series ===
ADF Statistic : -4.6970
p-value : 0.0001
Lags Used : 16
Critical Values :
    1%: -3.4526
    5%: -2.8714
    10%: -2.5720

🧪 ADF Test Insights: Stationarity Check of Sugar Prices¶

📉 Original Series (Non-Stationary)¶

  • ADF Statistic: -0.9168
  • p-value: 0.7824
  • Conclusion:
    • The p-value is greater than 0.05, so we fail to reject the null hypothesis.
    • This implies that the sugar price time series is non-stationary in its original form.
    • This is expected, as sugar prices show a long-term upward trend and seasonal behavior.

🔁 Transformed Series (Rolling Mean Differenced)¶

  • ADF Statistic: -4.6970
  • p-value: 0.0001
  • Conclusion:
    • The p-value is less than 0.05, so we reject the null hypothesis.
    • The rolling mean differenced series is stationary.
    • The transformation effectively removed trend and seasonality, making it suitable for time series modeling.

✅ Summary Table¶

Series ADF Statistic p-value Stationary?
Original -0.9168 0.7824 ❌ No
Rolling Mean Differenced -4.6970 0.0001 ✅ Yes

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Copy full monthly country-level data
df_data = df_country.copy()

# Train-test split (you can change split date as needed)
split_date = pd.to_datetime("2019-01-01")
df_train = df_data[df_data.index < split_date]
df_test = df_data[df_data.index >= split_date]
df_test_copy = df_test.copy()

# ============================================
# ⚙️ ARMA Model on Original Series (Price)
# ============================================
print("Fitting ARMA(11,0,11) model on original price series...")

# Fit ARMA on original price series
arma_model_price = SARIMAX(df_train['price'], order=(11, 0, 11))
arma_model_price = arma_model_price.fit(disp=False)

# Predict using relative integer positions
start_idx = len(df_train)
end_idx = start_idx + len(df_test) - 1
arma_forecast_price = arma_model_price.predict(start=start_idx, end=end_idx, dynamic=True)

# Store forecast in test copy
df_test_copy['arma_forecast_original'] = arma_forecast_price.values

# Plot forecast vs actual
plt.figure(figsize=(12, 6))
plt.plot(df_data.index, df_data['price'], label='Actual Price', color='black')
plt.plot(df_test_copy.index, df_test_copy['arma_forecast_original'], label='ARMA Forecast', color='red')
plt.title("ARMA Forecast on Original Series")
plt.xlabel("Date")
plt.ylabel("Sugar Price")
plt.legend()
plt.grid(True)
plt.show()

# ============================================================
# ⚙️ ARMA Model on Rolling Mean Differenced (Stationary) Series
# ============================================================
print("Fitting ARMA(11,0,11) model on stationary rolling mean differenced series...")

# Create 12-month rolling mean differenced series
df_data['rolling_mean'] = df_data['price'].rolling(window=12).mean()
df_data['rolling_mean_diff'] = df_data['rolling_mean'] - df_data['rolling_mean'].shift()
rolling_diff = df_data['rolling_mean_diff'].dropna()

# Fit ARMA on differenced series
arma_model_rolling = SARIMAX(rolling_diff, order=(11, 0, 11))
arma_model_rolling = arma_model_rolling.fit(disp=False)

# Forecast using relative positions
start_idx_diff = len(rolling_diff) - len(df_test)
end_idx_diff = start_idx_diff + len(df_test) - 1
arma_forecast_diff = arma_model_rolling.predict(start=start_idx_diff, end=end_idx_diff, dynamic=True)

# Align forecast by index and store
forecast_index = rolling_diff.index[start_idx_diff:end_idx_diff + 1]
df_data.loc[forecast_index, 'arma_forecast_diff'] = arma_forecast_diff.values

# Plot forecast on stationary differenced series
plt.figure(figsize=(12, 6))
plt.plot(df_data['rolling_mean_diff'], label='Differenced Series', color='gray')
plt.plot(forecast_index, arma_forecast_diff, label='ARMA Forecast (Diff)', color='blue')
plt.title("ARMA Forecast on Stationary Differenced Series")
plt.xlabel("Date")
plt.ylabel("Rolling Mean Differenced Price")
plt.legend()
plt.grid(True)
plt.show()
Fitting ARMA(11,0,11) model on original price series...
C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning:

No supported index is available. Prediction results will be given with an integer index beginning at `start`.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning:

No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.

No description has been provided for this image
Fitting ARMA(11,0,11) model on stationary rolling mean differenced series...
C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

No description has been provided for this image
In [15]:
print(arma_model_price.summary())
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                  price   No. Observations:                  252
Model:             SARIMAX(11, 0, 11)   Log Likelihood                -351.994
Date:                Wed, 16 Apr 2025   AIC                            749.988
Time:                        14:21:48   BIC                            831.165
Sample:                             0   HQIC                           782.652
                                - 252                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.3476      4.818     -0.072      0.942      -9.792       9.096
ar.L2          0.1592      1.164      0.137      0.891      -2.123       2.441
ar.L3          0.1139      0.901      0.126      0.899      -1.652       1.880
ar.L4          0.0723      0.606      0.119      0.905      -1.115       1.260
ar.L5          0.1015      0.300      0.338      0.735      -0.487       0.690
ar.L6          0.0266      0.497      0.053      0.957      -0.947       1.001
ar.L7          0.0394      0.216      0.182      0.856      -0.385       0.464
ar.L8         -0.0952      0.295     -0.323      0.747      -0.673       0.482
ar.L9          0.2216      0.609      0.364      0.716      -0.971       1.415
ar.L10         0.6469      1.144      0.566      0.572      -1.595       2.889
ar.L11         0.0554      2.906      0.019      0.985      -5.641       5.752
ma.L1          1.2951      4.850      0.267      0.789      -8.210      10.800
ma.L2          1.4135      5.738      0.246      0.805      -9.832      12.659
ma.L3          1.4061      6.198      0.227      0.821     -10.741      13.554
ma.L4          1.4861      5.974      0.249      0.804     -10.222      13.194
ma.L5          1.3797      6.439      0.214      0.830     -11.240      13.999
ma.L6          1.2182      5.836      0.209      0.835     -10.221      12.657
ma.L7          1.1581      5.108      0.227      0.821      -8.853      11.169
ma.L8          1.2204      4.798      0.254      0.799      -8.184      10.625
ma.L9          0.8088      5.223      0.155      0.877      -9.427      11.045
ma.L10         0.0777      3.171      0.024      0.980      -6.136       6.292
ma.L11         0.0254      0.119      0.213      0.831      -0.208       0.259
sigma2         0.9120      0.122      7.491      0.000       0.673       1.151
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):               347.58
Prob(Q):                              0.98   Prob(JB):                         0.00
Heteroskedasticity (H):               2.34   Skew:                             0.11
Prob(H) (two-sided):                  0.00   Kurtosis:                         8.75
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# =============================
# 📉 Errors on Original Series
# =============================
actual_original = df_test['price']
predicted_original = df_test_copy['arma_forecast_original']

rmse_original = np.sqrt(mean_squared_error(actual_original, predicted_original))
mae_original = mean_absolute_error(actual_original, predicted_original)

print("=== Forecast Errors (Original Series) ===")
print(f"RMSE: {rmse_original:.4f}")
print(f"MAE : {mae_original:.4f}")


# ==================================================
# 📉 Errors on Rolling Mean Differenced Series
# Note: This is for evaluating differenced forecast
# ==================================================
# Align actual and predicted values
actual_diff = df_data.loc[forecast_index, 'rolling_mean_diff']
predicted_diff = df_data.loc[forecast_index, 'arma_forecast_diff']

rmse_diff = np.sqrt(mean_squared_error(actual_diff, predicted_diff))
mae_diff = mean_absolute_error(actual_diff, predicted_diff)

print("\n=== Forecast Errors (Differenced Series) ===")
print(f"RMSE: {rmse_diff:.4f}")
print(f"MAE : {mae_diff:.4f}")
=== Forecast Errors (Original Series) ===
RMSE: 6.6685
MAE : 6.0780

=== Forecast Errors (Differenced Series) ===
RMSE: 0.1611
MAE : 0.1276
In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Copy full monthly country-level data
df_data = df_country.copy()

# Train-test split
split_date = pd.to_datetime("2019-01-01")
df_train = df_data[df_data.index < split_date]
df_test = df_data[df_data.index >= split_date]
df_test_copy = df_test.copy()

# ============================================
# ⚙️ ARMA(2,0,2) Model on Original Series (Price)
# ============================================
print("Fitting ARMA(2,0,2) model on original price series...")

# Fit ARMA on original price series
arma_model_price = SARIMAX(df_train['price'], order=(2, 0, 2))
arma_model_price = arma_model_price.fit(disp=False)

# Forecast
start_idx = len(df_train)
end_idx = start_idx + len(df_test) - 1
arma_forecast_price = arma_model_price.predict(start=start_idx, end=end_idx, dynamic=True)

# Store forecast
df_test_copy['arma_forecast_original'] = arma_forecast_price.values

# Plot forecast vs actual
plt.figure(figsize=(12, 6))
plt.plot(df_data.index, df_data['price'], label='Actual Price', color='black')
plt.plot(df_test_copy.index, df_test_copy['arma_forecast_original'], label='ARMA(2,2) Forecast', color='red')
plt.title("ARMA(2,2) Forecast on Original Series")
plt.xlabel("Date")
plt.ylabel("Sugar Price")
plt.legend()
plt.grid(True)
plt.show()

# ============================================================
# ⚙️ ARMA(2,0,2) Model on Rolling Mean Differenced Series
# ============================================================
print("Fitting ARMA(2,0,2) model on stationary rolling mean differenced series...")

# Create 12-month rolling mean differenced series
df_data['rolling_mean'] = df_data['price'].rolling(window=12).mean()
df_data['rolling_mean_diff'] = df_data['rolling_mean'] - df_data['rolling_mean'].shift()
rolling_diff = df_data['rolling_mean_diff'].dropna()

# Fit ARMA on differenced series
arma_model_rolling = SARIMAX(rolling_diff, order=(2, 0, 2))
arma_model_rolling = arma_model_rolling.fit(disp=False)

# Forecast
start_idx_diff = len(rolling_diff) - len(df_test)
end_idx_diff = start_idx_diff + len(df_test) - 1
arma_forecast_diff = arma_model_rolling.predict(start=start_idx_diff, end=end_idx_diff, dynamic=True)

# Align forecast
forecast_index = rolling_diff.index[start_idx_diff:end_idx_diff + 1]
df_data.loc[forecast_index, 'arma_forecast_diff'] = arma_forecast_diff.values

# Plot forecast on stationary differenced series
plt.figure(figsize=(12, 6))
plt.plot(df_data['rolling_mean_diff'], label='Differenced Series', color='gray')
plt.plot(forecast_index, arma_forecast_diff, label='ARMA(2,2) Forecast (Diff)', color='blue')
plt.title("ARMA(2,2) Forecast on Stationary Differenced Series")
plt.xlabel("Date")
plt.ylabel("Rolling Mean Differenced Price")
plt.legend()
plt.grid(True)
plt.show()
Fitting ARMA(2,0,2) model on original price series...
C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning:

No supported index is available. Prediction results will be given with an integer index beginning at `start`.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning:

No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.

No description has been provided for this image
Fitting ARMA(2,0,2) model on stationary rolling mean differenced series...
C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

No description has been provided for this image
In [18]:
# ================
# Evaluation: Original Series Forecast
# ================
rmse_original = np.sqrt(mean_squared_error(df_test['price'], df_test_copy['arma_forecast_original']))
mae_original = mean_absolute_error(df_test['price'], df_test_copy['arma_forecast_original'])

print("\n🔍 ARMA(2,2) on Original Price Series:")
print(f"RMSE: {rmse_original:.4f}")
print(f"MAE : {mae_original:.4f}")

# ============================
# Evaluation: Differenced Series Forecast
# ============================

# Drop NA to align ground truth and forecast
true_diff = df_data.loc[forecast_index, 'rolling_mean_diff'].dropna()
pred_diff = df_data.loc[forecast_index, 'arma_forecast_diff'].dropna()

# Ensure alignment
true_diff, pred_diff = true_diff.align(pred_diff, join='inner')

rmse_diff = np.sqrt(mean_squared_error(true_diff, pred_diff))
mae_diff = mean_absolute_error(true_diff, pred_diff)

print("\n🔍 ARMA(2,2) on Rolling Mean Differenced Series:")
print(f"RMSE: {rmse_diff:.4f}")
print(f"MAE : {mae_diff:.4f}")
🔍 ARMA(2,2) on Original Price Series:
RMSE: 6.3573
MAE : 5.6535

🔍 ARMA(2,2) on Rolling Mean Differenced Series:
RMSE: 0.1621
MAE : 0.1216
In [19]:
print(arma_model_price.summary())
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                  price   No. Observations:                  252
Model:               SARIMAX(2, 0, 2)   Log Likelihood                -367.543
Date:                Wed, 16 Apr 2025   AIC                            745.087
Time:                        14:21:49   BIC                            762.734
Sample:                             0   HQIC                           752.188
                                - 252                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.4461      0.161      2.778      0.005       0.131       0.761
ar.L2          0.5526      0.161      3.430      0.001       0.237       0.868
ma.L1          0.5252      0.163      3.224      0.001       0.206       0.845
ma.L2          0.1988      0.047      4.195      0.000       0.106       0.292
sigma2         1.0538      0.053     19.698      0.000       0.949       1.159
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):               426.36
Prob(Q):                              0.94   Prob(JB):                         0.00
Heteroskedasticity (H):               3.12   Skew:                             0.30
Prob(H) (two-sided):                  0.00   Kurtosis:                         9.34
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

🔍 Interpretation of ARMA(2,2) Model on Sugar Prices¶

The ARMA(2,2) model captures the relationship between current sugar prices, their past values, and recent forecast errors. The model estimates are as follows:

✅ Model Coefficients:¶

  • AR(1) = 0.4461: The current price is positively influenced by 44.6% of the previous month's price.
  • AR(2) = 0.5526: There is a 55.3% influence from the price two months ago.
  • MA(1) = 0.5252: The model adjusts for 52.5% of the previous forecast error.
  • MA(2) = 0.1988: It also adjusts for 19.9% of the forecast error from two months ago.

All coefficients are statistically significant (p < 0.01), indicating that they contribute meaningfully to the model.

📊 Model Diagnostics:¶

  • AIC = 745.1: Lower than more complex models like ARMA(11,11), suggesting a more efficient fit.
  • Ljung-Box Test (p = 0.94): No autocorrelation in residuals — the model captures temporal dependencies well.
  • Jarque-Bera Test (p ≈ 0.00): Residuals deviate from normality, indicating some non-Gaussian behavior.

📉 Forecast Accuracy:¶

Model Type RMSE MAE
Original Price Series 6.3573 5.6535
Rolling Mean Differenced Series 0.1621 0.1216

The forecast errors are notably lower for the rolling mean differenced series, reflecting that differencing made the series more stationary and predictable. However, the forecasted values in this case are not in the original price scale, and interpreting them requires reversing the differencing step.

📌 Summary:¶

The ARMA(2,2) model provides a statistically robust and interpretable structure for modeling monthly sugar prices. It strikes a balance between model complexity and forecasting performance. While it slightly underperforms on the original series compared to more complex alternatives, it is a strong candidate due to its simplicity and significance of parameters.

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Copy your monthly country-level data
df_data = df_country.copy()

# Train-test split
split_date = pd.to_datetime("2019-01-01")
df_train = df_data[df_data.index < split_date]
df_test = df_data[df_data.index >= split_date]
df_test_copy = df_test.copy()

# ======================================================
# 🔁 ARIMA(2,1,2) on Original Price Series
# ======================================================
print("Fitting ARIMA(2,1,2) on original price series...")

# Fit ARIMA model on original price series
arima_model = SARIMAX(df_train['price'], order=(2, 1, 2))
arima_result = arima_model.fit(disp=False)

# Forecast using relative positions
start_idx = len(df_train)
end_idx = start_idx + len(df_test) - 1
forecast_arima = arima_result.predict(start=start_idx, end=end_idx, dynamic=True)

# Save forecast
df_test_copy['arima_forecast_original'] = forecast_arima.values

# Plot
plt.figure(figsize=(12, 6))
plt.plot(df_data.index, df_data['price'], label='Actual Price', color='black')
plt.plot(df_test_copy.index, df_test_copy['arima_forecast_original'], label='ARIMA Forecast', color='orange')
plt.title("ARIMA(2,1,2) Forecast on Original Series")
plt.xlabel("Date")
plt.ylabel("Sugar Price")
plt.legend()
plt.grid(True)
plt.show()

# Compute error metrics
rmse_orig = np.sqrt(mean_squared_error(df_test['price'], df_test_copy['arima_forecast_original']))
mae_orig = mean_absolute_error(df_test['price'], df_test_copy['arima_forecast_original'])

print(f"🔍 ARIMA(2,1,2) on Original Price Series:\nRMSE: {rmse_orig:.4f}\nMAE : {mae_orig:.4f}")


# ================================================================
# 🔁 ARIMA(2,1,2) on 12-Month Rolling Mean Differenced Series
# ================================================================
print("Fitting ARIMA(2,1,2) on rolling mean differenced series...")

# Create rolling mean differenced series
df_data['rolling_mean'] = df_data['price'].rolling(window=12).mean()
df_data['rolling_mean_diff'] = df_data['rolling_mean'] - df_data['rolling_mean'].shift()
rolling_diff = df_data['rolling_mean_diff'].dropna()

# Fit ARIMA model on stationary series
arima_model_roll = SARIMAX(rolling_diff, order=(2, 1, 2))
arima_result_roll = arima_model_roll.fit(disp=False)

# Forecast using relative positions
start_idx_diff = len(rolling_diff) - len(df_test)
end_idx_diff = start_idx_diff + len(df_test) - 1
forecast_arima_diff = arima_result_roll.predict(start=start_idx_diff, end=end_idx_diff, dynamic=True)

# Align index
forecast_index = rolling_diff.index[start_idx_diff:end_idx_diff + 1]
df_data.loc[forecast_index, 'arima_forecast_diff'] = forecast_arima_diff.values

# Plot
plt.figure(figsize=(12, 6))
plt.plot(df_data['rolling_mean_diff'], label='Rolling Mean Diff', color='gray')
plt.plot(forecast_index, forecast_arima_diff, label='ARIMA Forecast (Diff)', color='blue')
plt.title("ARIMA(2,1,2) Forecast on Differenced Rolling Mean Series")
plt.xlabel("Date")
plt.ylabel("Rolling Mean Differenced Price")
plt.legend()
plt.grid(True)
plt.show()

# Evaluate performance (dropna to align with test range)
true_diff = df_data.loc[forecast_index, 'rolling_mean_diff'].dropna()
forecast_diff = df_data.loc[forecast_index, 'arima_forecast_diff'].dropna()

rmse_diff = np.sqrt(mean_squared_error(true_diff, forecast_diff))
mae_diff = mean_absolute_error(true_diff, forecast_diff)

print(f"🔍 ARIMA(2,1,2) on Rolling Mean Differenced Series:\nRMSE: {rmse_diff:.4f}\nMAE : {mae_diff:.4f}")
Fitting ARIMA(2,1,2) on original price series...
C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning:

No supported index is available. Prediction results will be given with an integer index beginning at `start`.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning:

No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.

No description has been provided for this image
🔍 ARIMA(2,1,2) on Original Price Series:
RMSE: 5.5426
MAE : 5.0165
Fitting ARIMA(2,1,2) on rolling mean differenced series...
C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

No description has been provided for this image
🔍 ARIMA(2,1,2) on Rolling Mean Differenced Series:
RMSE: 0.4223
MAE : 0.4021

📌 ARIMA(2,1,2) Model Insights¶

We applied the ARIMA(2,1,2) model on both the original sugar price series and the rolling mean differenced (stationary) version of the series. Below are the evaluation metrics and key observations.

🔍 ARIMA(2,1,2) on Original Price Series:¶

  • Root Mean Squared Error (RMSE): 5.5426
  • Mean Absolute Error (MAE): 5.0165

📝 Interpretation:
The ARIMA model incorporates one level of differencing to account for potential non-stationarity in the original price series. While it captures long-term trend components, the forecast deviates more from the actual prices compared to the ARMA(2,2) model. This could be due to:

  • Differencing removing valuable structure or seasonality in the data.
  • Remaining seasonal or cyclical patterns not addressed by the non-seasonal ARIMA model.

🔍 ARIMA(2,1,2) on Rolling Mean Differenced Series:¶

  • Root Mean Squared Error (RMSE): 0.4223
  • Mean Absolute Error (MAE): 0.4021

📝 Interpretation:
Applied to a transformed, rolling mean differenced series, the model fits the data better numerically. However, since the series is scaled and smoothed, this improvement doesn’t directly reflect performance on the original price level. While useful for trend detection, this model may not be as effective for actionable forecasting in absolute price terms.


💡 Key Takeaway:¶

Despite numerical improvements on the differenced series, the ARIMA(2,1,2) model underperforms the simpler ARMA(2,2) on the original price series. This suggests that:

  • Differencing may not be necessary for this dataset.
  • The series may be already stationary or nearly so.
  • A seasonal model (e.g., SARIMA) might better handle recurring patterns in the data.
In [21]:
print("\n=== ARIMA(2,1,2) Summary on Rolling Mean Differenced Series ===")
print(arima_result_roll.summary())
=== ARIMA(2,1,2) Summary on Rolling Mean Differenced Series ===
                               SARIMAX Results                                
==============================================================================
Dep. Variable:      rolling_mean_diff   No. Observations:                  313
Model:               SARIMAX(2, 1, 2)   Log Likelihood                 242.174
Date:                Wed, 16 Apr 2025   AIC                           -474.349
Time:                        14:21:50   BIC                           -455.634
Sample:                             0   HQIC                          -466.869
                                - 313                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.6672      0.037     44.798      0.000       1.594       1.740
ar.L2         -0.8271      0.035    -23.897      0.000      -0.895      -0.759
ma.L1         -1.7340      0.023    -76.849      0.000      -1.778      -1.690
ma.L2          0.9650      0.020     48.564      0.000       0.926       1.004
sigma2         0.0123      0.000     25.478      0.000       0.011       0.013
===================================================================================
Ljung-Box (L1) (Q):                   1.32   Jarque-Bera (JB):               589.04
Prob(Q):                              0.25   Prob(JB):                         0.00
Heteroskedasticity (H):               2.06   Skew:                            -0.68
Prob(H) (two-sided):                  0.00   Kurtosis:                         9.59
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Copy your monthly country-level data
df_data = df_country.copy()

# Create rolling mean and differenced series
df_data['rolling_mean'] = df_data['price'].rolling(window=12).mean()
df_data['rolling_mean_diff'] = df_data['rolling_mean'] - df_data['rolling_mean'].shift()
rolling_diff = df_data['rolling_mean_diff'].dropna()

# Train-test split
split_date = pd.to_datetime("2019-01-01")
df_train_orig = df_data[df_data.index < split_date]
df_test_orig = df_data[df_data.index >= split_date]
df_test_copy = df_test_orig.copy()

df_train_roll = rolling_diff[rolling_diff.index < split_date]
df_test_roll = rolling_diff[rolling_diff.index >= split_date]

# =====================================================
# 1️⃣ SARIMA on Original Price Series
# =====================================================
print("🔁 Fitting SARIMA(2,1,2)(1,1,1,12) on original price series...")

sarima_model_orig = SARIMAX(
    df_train_orig['price'],
    order=(2, 1, 2),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False
)

sarima_result_orig = sarima_model_orig.fit(disp=False)

# Forecast
start_idx_orig = len(df_train_orig)
end_idx_orig = start_idx_orig + len(df_test_orig) - 1
forecast_sarima_orig = sarima_result_orig.predict(start=start_idx_orig, end=end_idx_orig, dynamic=True)

# Save forecast
df_test_copy['sarima_forecast_original'] = forecast_sarima_orig.values

# Plot original
plt.figure(figsize=(12, 6))
plt.plot(df_data.index, df_data['price'], label='Actual Price', color='black')
plt.plot(df_test_copy.index, df_test_copy['sarima_forecast_original'], label='SARIMA Forecast', color='orange')
plt.title("SARIMA(2,1,2)(1,1,1,12) Forecast on Original Series")
plt.xlabel("Date")
plt.ylabel("Sugar Price")
plt.legend()
plt.grid(True)
plt.show()

# Metrics
rmse_orig = np.sqrt(mean_squared_error(df_test_orig['price'], df_test_copy['sarima_forecast_original']))
mae_orig = mean_absolute_error(df_test_orig['price'], df_test_copy['sarima_forecast_original'])

print(f"📊 SARIMA on Original Price Series:\nRMSE: {rmse_orig:.4f}\nMAE : {mae_orig:.4f}")

# =====================================================
# 2️⃣ SARIMA on Rolling Mean Differenced Series
# =====================================================
print("\n🔁 Fitting SARIMA(2,1,2)(1,1,1,12) on rolling mean differenced series...")

sarima_model_roll = SARIMAX(
    df_train_roll,
    order=(2, 1, 2),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False
)

sarima_result_roll = sarima_model_roll.fit(disp=False)

# Forecast
start_idx_roll = len(df_train_roll)
end_idx_roll = start_idx_roll + len(df_test_roll) - 1
forecast_sarima_roll = sarima_result_roll.predict(start=start_idx_roll, end=end_idx_roll, dynamic=True)

# Save forecast in main df
forecast_index = df_test_roll.index
df_data.loc[forecast_index, 'sarima_forecast_diff'] = forecast_sarima_roll.values

# Plot rolling
plt.figure(figsize=(12, 6))
plt.plot(df_data['rolling_mean_diff'], label='Rolling Mean Diff', color='gray')
plt.plot(forecast_index, forecast_sarima_roll, label='SARIMA Forecast (Diff)', color='green')
plt.title("SARIMA(2,1,2)(1,1,1,12) Forecast on Rolling Mean Differenced Series")
plt.xlabel("Date")
plt.ylabel("Rolling Mean Differenced Price")
plt.legend()
plt.grid(True)
plt.show()

# Metrics
true_diff = df_data.loc[forecast_index, 'rolling_mean_diff'].dropna()
forecast_diff = df_data.loc[forecast_index, 'sarima_forecast_diff'].dropna()

rmse_roll = np.sqrt(mean_squared_error(true_diff, forecast_diff))
mae_roll = mean_absolute_error(true_diff, forecast_diff)

print(f"📊 SARIMA on Rolling Mean Differenced Series:\nRMSE: {rmse_roll:.4f}\nMAE : {mae_roll:.4f}")
🔁 Fitting SARIMA(2,1,2)(1,1,1,12) on original price series...
C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning:

No supported index is available. Prediction results will be given with an integer index beginning at `start`.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning:

No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.

No description has been provided for this image
📊 SARIMA on Original Price Series:
RMSE: 2.1080
MAE : 1.8513

🔁 Fitting SARIMA(2,1,2)(1,1,1,12) on rolling mean differenced series...
C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning:

No supported index is available. Prediction results will be given with an integer index beginning at `start`.

C:\Users\neeti\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning:

No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.

No description has been provided for this image
📊 SARIMA on Rolling Mean Differenced Series:
RMSE: 0.4574
MAE : 0.4364

📊 SARIMA Modeling Insights (Sugar Price Series)¶

🧪 1. Model Comparison Results¶

Model RMSE MAE
SARIMA on Original Series 2.1080 1.8513
SARIMA on Rolling Mean Differenced Series 0.4574 0.4364
  • ✅ Lower RMSE/MAE for rolling-mean differenced SARIMA suggests better short-term forecasting performance due to improved stationarity.
  • ⚠️ The original series, with trend and seasonality, caused higher forecast errors.

🔍 2. Seasonal Decomposition Analysis¶

  • Trend: Long-term upward trend in sugar prices, especially post-2008.
  • Seasonality: Clear annual cycle (12-month pattern) with consistent amplitude.
  • Residuals: Mostly stable, but contain occasional spikes (e.g., 2011–2012), indicating market shocks not captured by trend/seasonality.

💡 3. Next Steps¶

  • Consider using LSTM (Long Short-Term Memory neural networks):
    • Captures nonlinear and long-range dependencies.
    • Handles seasonality and trend without explicit decomposition.
    • More robust against irregular spikes and volatile fluctuations.
In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ===============================
# 🧹 Step 1: Prepare Data
# ===============================
df_data = df_country.copy()
df_data['rolling_mean'] = df_data['price'].rolling(window=12).mean()
df_data['rolling_mean_diff'] = df_data['rolling_mean'] - df_data['rolling_mean'].shift()
df_data.dropna(inplace=True)

# Split train/test
split_date = pd.to_datetime("2019-01-01")
df_train = df_data[df_data.index < split_date]
df_test = df_data[df_data.index >= split_date]

# ===============================
# 🔁 Step 2: Scaling
# ===============================
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(df_train[['price']])
test_scaled = scaler.transform(df_test[['price']])

# ===============================
# 🔁 Step 3: Create Sequences
# ===============================
def create_sequences(data, window):
    X, y = [], []
    for i in range(len(data) - window):
        X.append(data[i:i + window])
        y.append(data[i + window])
    return np.array(X), np.array(y)

window_size = 12
X_train, y_train = create_sequences(train_scaled, window_size)
X_test, y_test = create_sequences(test_scaled, window_size)

# Reshape for LSTM
X_train = X_train.reshape((X_train.shape[0], window_size, 1))
X_test = X_test.reshape((X_test.shape[0], window_size, 1))

# ===============================
# 🔮 Step 4: Build and Train LSTM
# ===============================
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(window_size, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=50, batch_size=8, verbose=0)

# ===============================
# 🔮 Step 5: Forecast and Evaluate
# ===============================
y_pred_scaled = model.predict(X_test)
y_pred = scaler.inverse_transform(y_pred_scaled)
y_actual = scaler.inverse_transform(y_test)

# Metrics
rmse = np.sqrt(mean_squared_error(y_actual, y_pred))
mae = mean_absolute_error(y_actual, y_pred)

print(f"\n📊 LSTM Forecasting Results:\nRMSE: {rmse:.4f}\nMAE : {mae:.4f}")

# ===============================
# 📈 Step 6: Plot
# ===============================
forecast_index = df_test.index[window_size:]
plt.figure(figsize=(12, 6))
plt.plot(forecast_index, y_actual, label='Actual')
plt.plot(forecast_index, y_pred, label='LSTM Forecast', color='orange')
plt.title('LSTM Forecast on Monthly Sugar Price Series')
plt.xlabel('Date')
plt.ylabel('Price (INR)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
C:\Users\neeti\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:200: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 173ms/step

📊 LSTM Forecasting Results:
RMSE: 1.3163
MAE : 0.9080
No description has been provided for this image
In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Copy your monthly country-level data
df_data = df_country.copy()

# Create rolling mean and differenced series
df_data['rolling_mean'] = df_data['price'].rolling(window=12).mean()
df_data['rolling_mean_diff'] = df_data['rolling_mean'] - df_data['rolling_mean'].shift()
rolling_diff = df_data['rolling_mean_diff'].dropna()

# Train-test split
split_date = pd.to_datetime("2019-01-01")
df_train_roll = rolling_diff[rolling_diff.index < split_date]
df_test_roll = rolling_diff[rolling_diff.index >= split_date]
forecast_index = df_test_roll.index

# Scale the series
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(df_train_roll.values.reshape(-1, 1))
scaled_test = scaler.transform(df_test_roll.values.reshape(-1, 1))

# Create sequences for LSTM
def create_sequences(data, seq_length=12):
    X, y = [], []
    for i in range(seq_length, len(data)):
        X.append(data[i-seq_length:i])
        y.append(data[i])
    return np.array(X), np.array(y)

seq_len = 12
X_train, y_train = create_sequences(scaled_train, seq_len)
X_test, y_test = create_sequences(scaled_test, seq_len)

# Reshape for LSTM [samples, time steps, features]
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build and train the LSTM model
model_roll = Sequential()
model_roll.add(LSTM(50, activation='relu', input_shape=(seq_len, 1)))
model_roll.add(Dense(1))
model_roll.compile(optimizer='adam', loss='mse')
model_roll.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0)

# Predict and inverse scale
y_pred_scaled = model_roll.predict(X_test)
y_pred_inv = scaler.inverse_transform(y_pred_scaled)

# Align forecast with index
forecast_index_adj = forecast_index[seq_len:]
df_data.loc[forecast_index_adj, 'lstm_forecast_diff'] = y_pred_inv.flatten()

# Plot results
plt.figure(figsize=(12, 6))
plt.plot(df_data['rolling_mean_diff'], label='Rolling Mean Diff', color='gray')
plt.plot(forecast_index_adj, y_pred_inv, label='LSTM Forecast (Diff)', color='teal')
plt.title("LSTM Forecast on Rolling Mean Differenced Series")
plt.xlabel("Date")
plt.ylabel("Rolling Mean Differenced Price")
plt.legend()
plt.grid(True)
plt.show()

# Evaluate model
true_diff = df_data.loc[forecast_index_adj, 'rolling_mean_diff'].dropna()
forecast_diff = df_data.loc[forecast_index_adj, 'lstm_forecast_diff'].dropna()

rmse_lstm_roll = np.sqrt(mean_squared_error(true_diff, forecast_diff))
mae_lstm_roll = mean_absolute_error(true_diff, forecast_diff)

print(f"📊 LSTM on Rolling Mean Differenced Series:\nRMSE: {rmse_lstm_roll:.4f}\nMAE : {mae_lstm_roll:.4f}")
C:\Users\neeti\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:200: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

2/2 ━━━━━━━━━━━━━━━━━━━━ 4s 189ms/step
No description has been provided for this image
📊 LSTM on Rolling Mean Differenced Series:
RMSE: 0.1026
MAE : 0.0723
In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Step 1: Copy your data
df_data = df_country.copy()

# Step 2: Create rolling mean and differenced series
df_data['rolling_mean'] = df_data['price'].rolling(window=12).mean()
df_data['rolling_mean_diff'] = df_data['rolling_mean'] - df_data['rolling_mean'].shift()
rolling_diff = df_data['rolling_mean_diff'].dropna()

# Step 3: Train-test split
split_date = pd.to_datetime("2019-01-01")
df_train_orig = df_data[df_data.index < split_date]
df_test_orig = df_data[df_data.index >= split_date]
df_test_copy = df_test_orig.copy()

# Step 4: Scaling original series
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(df_train_orig[['price']])
scaled_test = scaler.transform(df_test_orig[['price']])

# Step 5: Create sequences for LSTM
def create_sequences(data, seq_length=12):
    X, y = [], []
    for i in range(seq_length, len(data)):
        X.append(data[i-seq_length:i])
        y.append(data[i])
    return np.array(X), np.array(y)

seq_len = 12
X_train, y_train = create_sequences(scaled_train, seq_len)
X_test, y_test = create_sequences(scaled_test, seq_len)

X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Step 6: Define and train LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_len, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0)

# Step 7: Forecast
y_pred = model.predict(X_test)
y_pred_inv = scaler.inverse_transform(y_pred)

# Step 8: Align results
df_test_copy = df_test_copy.iloc[seq_len:]
df_test_copy['lstm_forecast'] = y_pred_inv.flatten()

# Step 9: Plot
plt.figure(figsize=(12, 6))
plt.plot(df_data.index, df_data['price'], label='Actual Price', color='black')
plt.plot(df_test_copy.index, df_test_copy['lstm_forecast'], label='LSTM Forecast', color='purple')
plt.title("LSTM Forecast on Original Series")
plt.xlabel("Date")
plt.ylabel("Sugar Price")
plt.legend()
plt.grid(True)
plt.show()

# Step 10: Evaluate
rmse = np.sqrt(mean_squared_error(df_test_copy['price'], df_test_copy['lstm_forecast']))
mae = mean_absolute_error(df_test_copy['price'], df_test_copy['lstm_forecast'])

print(f"📊 LSTM Forecast (Original Series):\nRMSE: {rmse:.4f}\nMAE : {mae:.4f}")
C:\Users\neeti\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:200: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

WARNING:tensorflow:5 out of the last 5 calls to <function TensorFlowTrainer.make_predict_function.<locals>.one_step_on_data_distributed at 0x00000248D61F5E40> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
1/2 ━━━━━━━━━━━━━━━━━━━━ 0s 191ms/stepWARNING:tensorflow:6 out of the last 6 calls to <function TensorFlowTrainer.make_predict_function.<locals>.one_step_on_data_distributed at 0x00000248D61F5E40> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 201ms/step
No description has been provided for this image
📊 LSTM Forecast (Original Series):
RMSE: 1.2066
MAE : 0.7908
In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Copy your data
df_data = df_country.copy()

# Create rolling mean and diff
df_data['rolling_mean'] = df_data['price'].rolling(window=12).mean()
df_data['rolling_mean_diff'] = df_data['rolling_mean'] - df_data['rolling_mean'].shift()
rolling_diff = df_data['rolling_mean_diff'].dropna()

# Train-test split
split_date = pd.to_datetime("2019-01-01")
df_train_orig = df_data[df_data.index < split_date]
df_test_orig = df_data[df_data.index >= split_date]
df_test_copy = df_test_orig.copy()

df_train_roll = rolling_diff[rolling_diff.index < split_date]
df_test_roll = rolling_diff[rolling_diff.index >= split_date]

# --- FUNCTION FOR LSTM MODELING ---
def run_lstm_forecast(series, forecast_label='lstm_forecast', title='LSTM Forecast', ylabel='Price'):
    # Scale data
    scaler = MinMaxScaler()
    scaled_series = scaler.fit_transform(series.values.reshape(-1, 1))
    
    # Sequence creation
    def create_sequences(data, seq_length=12):
        X, y = [], []
        for i in range(seq_length, len(data)):
            X.append(data[i-seq_length:i])
            y.append(data[i])
        return np.array(X), np.array(y)

    seq_len = 12
    X, y = create_sequences(scaled_series, seq_len)
    X = X.reshape((X.shape[0], X.shape[1], 1))

    # Train-test split
    split_point = len(series[series.index < split_date]) - seq_len
    X_train, X_test = X[:split_point], X[split_point:]
    y_train, y_test = y[:split_point], y[split_point:]

    # Model
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(seq_len, 1)))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0)

    # Forecast
    y_pred_scaled = model.predict(X_test)
    y_pred = scaler.inverse_transform(y_pred_scaled)
    y_true = scaler.inverse_transform(y_test)

    # Align index
    forecast_idx = series.index[seq_len+split_point:]
    forecast_df = pd.DataFrame(index=forecast_idx)
    forecast_df[forecast_label] = y_pred.flatten()
    forecast_df['true'] = y_true.flatten()
    forecast_df['residual'] = forecast_df['true'] - forecast_df[forecast_label]

    # Plot predictions
    plt.figure(figsize=(14, 6))
    plt.plot(series, label='Actual', color='black')
    plt.plot(forecast_df[forecast_label], label='LSTM Forecast', color='teal')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel("Date")
    plt.grid(True)
    plt.legend()
    plt.show()

    # Plot residuals
    plt.figure(figsize=(14, 4))
    plt.plot(forecast_df['residual'], label='Residuals', color='red')
    plt.axhline(0, linestyle='--', color='gray')
    plt.title("LSTM Forecast Residuals")
    plt.ylabel("Residual")
    plt.grid(True)
    plt.show()

    # Metrics
    rmse = np.sqrt(mean_squared_error(forecast_df['true'], forecast_df[forecast_label]))
    mae = mean_absolute_error(forecast_df['true'], forecast_df[forecast_label])
    print(f"📊 {title}\nRMSE: {rmse:.4f}\nMAE : {mae:.4f}")

# ===============================
# Run LSTM on Original Price Series
# ===============================
run_lstm_forecast(
    series=df_data['price'].dropna(),
    forecast_label='lstm_forecast_original',
    title='LSTM Forecast on Original Price Series',
    ylabel='Sugar Price (INR)'
)

# ===============================
# Run LSTM on Rolling Mean Differenced Series
# ===============================
run_lstm_forecast(
    series=rolling_diff,
    forecast_label='lstm_forecast_diff',
    title='LSTM Forecast on Rolling Mean Differenced Series',
    ylabel='Rolling Mean Differenced Price'
)
C:\Users\neeti\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:200: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 102ms/step
No description has been provided for this image
No description has been provided for this image
📊 LSTM Forecast on Original Price Series
RMSE: 1.9501
MAE : 1.6564
C:\Users\neeti\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:200: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 87ms/step
No description has been provided for this image
No description has been provided for this image
📊 LSTM Forecast on Rolling Mean Differenced Series
RMSE: 0.1124
MAE : 0.0797
In [ ]: